#!/usr/bin/env bash

# Shell options: treat expansion of an unset variable as an error (nounset)
# and make a pipeline fail when any of its stages fails (pipefail).
# errexit is deliberately not enabled here; it is turned on further down,
# right after the cleanup trap has been installed.
set -uo pipefail

# Print the command-line synopsis and terminate the script with status 1.
# The synopsis lists every option the getopts loop accepts:
#   -s  S3 data bucket (required unless S3_DATA_BUCKET is set in the environment)
#   -n  deployment name (optional; a timestamped default is generated)
#   -r  AWS region (optional; falls back to REGION / the AWS CLI configuration)
usage(){
	echo "Usage: $0 -s <S3 data bucket> [-n <deployment name>] [-r <region>]"
	exit 1
}

# Absolute, symlink-resolved directory containing this script. Every expansion
# is quoted so the lookup also works when the path contains whitespace.
cwd=$(dirname -- "$(realpath -- "$0")")
# Load the helper definitions that live next to this script.
source "$cwd"/fsx_setup

### Input parameters
DEPLOY_NAME="sagemaker-kfp-$(date '+%Y-%m-%d-%H-%M-%S')" # The name given to the entire deployment (tagging all resources)
REGION=${REGION:-"$(aws configure get region)"} # Deployment region

### Configuration parameters
EKS_EXISTING_CLUSTER=${EKS_EXISTING_CLUSTER:-""} # Use an existing EKS cluster
EKS_CLUSTER_VERSION=${EKS_CLUSTER_VERSION:-"1.22"} # EKS cluster K8s version
EKS_NODE_COUNT=${EKS_NODE_COUNT:-"2"} # The initial node count of the EKS cluster
EKS_PUBLIC_SUBNETS=${EKS_PUBLIC_SUBNETS:-""} # Comma-separated public subnets passed to eksctl (optional)
EKS_PRIVATE_SUBNETS=${EKS_PRIVATE_SUBNETS:-""} # Comma-separated private subnets; required unless FSx tests are skipped
SKIP_KFP_OIDC_SETUP=${SKIP_KFP_OIDC_SETUP:-"false"} # "false" creates, installs and later deletes the OIDC role
KFP_VERSION=${KFP_VERSION:-"2.0.0-alpha.7"} # Git ref of the KFP kustomize manifests

### Testing parameters
MINIO_LOCAL_PORT=${MINIO_LOCAL_PORT:-9000} # Local port the Minio service is forwarded to
KFP_NAMESPACE=${KFP_NAMESPACE:-"kubeflow"}
KFP_SERVICE_ACCOUNT=${KFP_SERVICE_ACCOUNT:-"pipeline-runner"}
AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID:-"$(aws sts get-caller-identity --query=Account --output=text)"}

PYTEST_MARKER=${PYTEST_MARKER:-""} # Optional pytest marker expression
S3_DATA_BUCKET=${S3_DATA_BUCKET:-""} # May also be supplied with -s
SAGEMAKER_EXECUTION_ROLE_ARN=${SAGEMAKER_EXECUTION_ROLE_ARN:-""}
ASSUMED_ROLE_NAME=${ASSUMED_ROLE_NAME:-""} # Empty means a role is created for this run
ROBOMAKER_EXECUTION_ROLE_ARN=${ROBOMAKER_EXECUTION_ROLE_ARN:-""}

SKIP_FSX_TESTS=${SKIP_FSX_TESTS:-"false"}

ACK_RELEASE_VERSION=${ACK_RELEASE_VERSION:-"1.2.4"}
# Exported so that the helm child process actually sees it; a plain shell
# variable is not part of the environment of the commands this script runs.
export HELM_EXPERIMENTAL_OCI=1
SERVICE=sagemaker
CHART_EXPORT_PATH=/tmp/chart
CHART_REF=sagemaker-chart
# Assume failure until the test run reports otherwise.
TEST_EXIT_STATUS=1

# Command-line options override the defaults assigned above:
#   -n <deployment name>   -r <region>   -s <S3 data bucket>
# The leading ':' in the option string selects silent error reporting, so
# unknown options and missing arguments are reported by the arms below.
while getopts ":n:r:s:" opt; do
  case "$opt" in
    n) DEPLOY_NAME="$OPTARG" ;;
    r) REGION="$OPTARG" ;;
    s) S3_DATA_BUCKET="$OPTARG" ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done

# Validate the required inputs before anything is provisioned.
# `usage` already exits with status 1; the explicit `exit 1` after each call
# is only a safeguard.

# A deployment name must be present
if [[ -z "$DEPLOY_NAME" ]]; then
  echo "Missing deployment name"
  usage
  exit 1
fi

# The tests need an S3 bucket for their data
if [[ -z "$S3_DATA_BUCKET" ]]; then
  echo "Missing S3 data bucket name"
  usage
  exit 1
fi

# Private subnets are only mandatory when the FSx tests are going to run
if [[ "$SKIP_FSX_TESTS" == "false" ]]; then
  if [[ -z "$EKS_PRIVATE_SUBNETS" ]]; then
    echo "Missing EKS private subnets"
    usage
    exit 1
  fi
fi

# EXIT-trap handler: best-effort teardown of the resources created by this run.
# Steps, in order: KFP, ACK controller, assumed role, OIDC role, CloudWatch
# stats push, FSx instance + security group, and finally the EKS cluster
# (only when the cluster was created by this script).
# Globals read: ACK_K8S_NAMESPACE, OIDC_ROLE_NAME, SKIP_KFP_OIDC_SETUP,
#               SKIP_FSX_TESTS, EKS_EXISTING_CLUSTER
function cleanup() {
  # Teardown is best-effort: a failing step must not stop the remaining ones.
  set +e

  cleanup_kfp
  # ACK_K8S_NAMESPACE is only assigned by install_ack; if the run failed before
  # that point there is nothing to uninstall.
  if [[ -n "${ACK_K8S_NAMESPACE:-}" ]]; then
    uninstall_ack
  fi
  delete_assumed_role

  # OIDC_ROLE_NAME is only assigned by generate_oidc_role_name. When the run
  # failed earlier it is unset, and expanding it under `set -u` would abort the
  # trap before the FSx and EKS teardown below had a chance to run.
  if [[ "${SKIP_KFP_OIDC_SETUP}" == "false" && -n "${OIDC_ROLE_NAME:-}" ]]; then
    delete_oidc_role
  fi

  # Push metrics to CloudWatch (path is relative to the current directory)
  echo "Pushing Codebuild stats to Cloudwatch."
  python ../../codebuild/scripts/push_stats_to_cloudwatch.py

  if [[ "${SKIP_FSX_TESTS}" == "false" ]]; then
    delete_fsx_instance
    # Sleep in order for the security group to detach before attempting to delete it
    sleep 15s
    cleanup_fsx_security_group
  fi

  # Never delete a cluster that was handed to us
  if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then
    delete_eks
  fi
}

# Register cleanup as the EXIT handler: it runs whenever the script exits from
# this point on, whether the run succeeded or failed.
trap cleanup EXIT
# errexit is enabled only after the trap is installed, so any command failure
# from here on terminates the script and triggers cleanup.
set -e

# Create the EKS cluster named by EKS_CLUSTER_NAME and point kubeconfig at it.
# The cluster config is first rendered with an eksctl dry run into
# generated-cluster.yaml (in the current directory), patched with yq to set
# disableIMDSv1 on the first managed node group, and then applied.
# Globals read: EKS_CLUSTER_NAME, EKS_NODE_COUNT, REGION, EKS_CLUSTER_VERSION,
#               EKS_PUBLIC_SUBNETS, EKS_PRIVATE_SUBNETS
function launch_eks() {
  echo "[Creating EKS] Launching EKS cluster $EKS_CLUSTER_NAME"

  eksctl_args=(
    --managed
    --nodes "${EKS_NODE_COUNT}"
    --node-type=c5.xlarge
    --region "${REGION}"
    --version "${EKS_CLUSTER_VERSION}"
  )
  # Subnet flags are only passed when the corresponding list was provided
  if [[ -n "${EKS_PUBLIC_SUBNETS}" ]]; then
    eksctl_args+=( --vpc-public-subnets="${EKS_PUBLIC_SUBNETS}" )
  fi
  if [[ -n "${EKS_PRIVATE_SUBNETS}" ]]; then
    eksctl_args+=( --vpc-private-subnets="${EKS_PRIVATE_SUBNETS}" )
  fi

  # This command can print out non yaml output too. Take caution when upgrading eksctl.
  eksctl create cluster "${EKS_CLUSTER_NAME}" "${eksctl_args[@]}" --dry-run \
    > generated-cluster.yaml

  yq -i ".managedNodeGroups[0].disableIMDSv1 = true" generated-cluster.yaml

  eksctl create cluster -f generated-cluster.yaml --auto-kubeconfig --timeout=60m
  rm generated-cluster.yaml

  aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$REGION"

  echo "[Creating EKS] $EKS_CLUSTER_NAME launched"
}

# Delete the EKS cluster named by EKS_CLUSTER_NAME, retrying on failure.
# The retry budget is `timeout` minutes consumed in `retry_interval`-minute
# steps, i.e. timeout / retry_interval attempts (currently three). The pause
# and the "Retrying" message are skipped after the final attempt, so a
# permanent failure is reported without an extra wait.
# Globals read: EKS_CLUSTER_NAME, REGION
# Returns: 0 once a delete succeeds, 1 when every attempt failed.
function delete_eks() {
  local -r time_unit=m
  local -r timeout=15
  local -r retry_interval=5
  local remaining=$timeout

  while (( remaining > 0 )); do
    if eksctl delete cluster --name "$EKS_CLUSTER_NAME" --region "$REGION" --wait; then
      echo "EKS cluster deleted"
      return 0
    fi

    remaining=$(( remaining - retry_interval ))
    # Only wait when another attempt is actually going to follow
    if (( remaining > 0 )); then
      echo "Failed to delete EKS cluster. Retrying after ${retry_interval}${time_unit}"
      sleep "${retry_interval}${time_unit}"
    fi
  done

  echo "Failed to delete EKS cluster even after trying for ${timeout}${time_unit}"
  return 1
}

# Install cert-manager and Kubeflow Pipelines, then port-forward Minio.
# The Minio port-forward runs in the background; its PID is stored in
# MINIO_PID so it can be stopped during cleanup.
# Globals read:    KFP_VERSION, KFP_NAMESPACE, MINIO_LOCAL_PORT
# Globals written: MINIO_PID
function install_kfp() {
  echo "[Installing KFP] Applying KFP manifests"
  #Install cert-manager
  kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.9.1/cert-manager.yaml
  kubectl wait --for=condition=ready pod -l 'app in (cert-manager,webhook)' --timeout=180s -n cert-manager
  # Cluster-scoped resources (CRDs) must be established before the namespaced manifests are applied
  kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/cert-manager/cluster-scoped-resources?ref=$KFP_VERSION&timeout=90s"
  kubectl wait --for condition=established --timeout=180s crd/applications.app.k8s.io
  kubectl apply -k "github.com/kubeflow/pipelines/manifests/kustomize/env/cert-manager/dev?ref=$KFP_VERSION&timeout=90s"
  echo "[Installing KFP] Port-forwarding Minio"

  kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=minio --timeout=5m
  # Use the configured namespace here as well, consistent with the waits around it
  kubectl port-forward -n "${KFP_NAMESPACE}" svc/minio-service "${MINIO_LOCAL_PORT}:9000" &
  MINIO_PID=$!

  echo "[Installing KFP] Minio port-forwarded to ${MINIO_LOCAL_PORT}"

  echo "[Installing KFP] Waiting for pods to stand up"
  #TODO: In the future, modify kubectl wait to end when only one pod becomes ready.
  sleep 3m

  kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=ml-pipeline --timeout=5m

  # TODO: Replace with calculated waits
  # For the moment we don't know which pods will be slower, so we are just relying on a fixed interval
  sleep 3m

  echo "[Installing KFP] Pipeline pods are ready"
}

# Install the ACK service controller from its OCI Helm chart.
# The chart is pulled and unpacked under CHART_EXPORT_PATH, values.yaml is
# patched with the region and the service-account role ARN, the CRDs are
# applied with kubectl, and the chart is installed with --skip-crds.
# Finally ack-rbac.yaml (resolved relative to the current directory) is applied.
# Globals read:    CHART_REF, ACK_RELEASE_VERSION, CHART_EXPORT_PATH, SERVICE,
#                  REGION, OIDC_ROLE_ARN
# Globals written: ACK_K8S_NAMESPACE (its presence tells cleanup to uninstall ACK)
function install_ack(){
  local chart_repo="public.ecr.aws/aws-controllers-k8s/${CHART_REF}"
  local chart_package="${CHART_REF}-${ACK_RELEASE_VERSION}.tgz"
  local chart_dir="${CHART_EXPORT_PATH}/${SERVICE}-chart"
  ACK_K8S_NAMESPACE=ack-system

  mkdir -p "${CHART_EXPORT_PATH}"

  helm pull "oci://${chart_repo}" --version "${ACK_RELEASE_VERSION}" -d "${CHART_EXPORT_PATH}"
  tar xvf "${CHART_EXPORT_PATH}/${chart_package}" -C "${CHART_EXPORT_PATH}"

  # yq's env()/strenv() read from the process environment, so both variables
  # must be exported; REGION may have been set via -r or the AWS CLI config
  # and would otherwise be invisible to yq.
  export REGION OIDC_ROLE_ARN
  yq e '.aws.region = env(REGION)' -i "${chart_dir}/values.yaml"
  yq e '.serviceAccount.annotations."eks.amazonaws.com/role-arn" = strenv(OIDC_ROLE_ARN)' -i "${chart_dir}/values.yaml"

  kubectl apply -f "${chart_dir}/crds"
  helm install -n "${ACK_K8S_NAMESPACE}" --create-namespace --skip-crds "ack-${SERVICE}-controller" \
    "${chart_dir}"
  kubectl apply -f ack-rbac.yaml
}

# Remove what install_ack created: all trainingjob resources in the KFP
# namespace first, then the chart's CRDs, and finally the ACK namespace.
# Globals read: KFP_NAMESPACE, CHART_EXPORT_PATH, SERVICE, ACK_K8S_NAMESPACE
function uninstall_ack(){
  local crd_dir="${CHART_EXPORT_PATH}/${SERVICE}-chart/crds"

  kubectl delete trainingjob --all -n "${KFP_NAMESPACE}"
  kubectl delete -f "${crd_dir}"
  kubectl delete namespace "${ACK_K8S_NAMESPACE}"
}

function generate_oidc_role_name() {
  OIDC_ROLE_NAME="$(echo "${DEPLOY_NAME}-kubeflow-role" | cut -c1-64)"
  OIDC_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${OIDC_ROLE_NAME}"
}

# Annotate the KFP service account with the OIDC role ARN
# (annotation key: eks.amazonaws.com/role-arn).
# Globals read: KFP_NAMESPACE, KFP_SERVICE_ACCOUNT, OIDC_ROLE_ARN
function install_oidc_role() {
  local annotation_patch
  printf -v annotation_patch \
    '{"metadata": {"annotations": {"eks.amazonaws.com/role-arn": "%s"}}}' \
    "${OIDC_ROLE_ARN}"

  kubectl patch serviceaccount -n "${KFP_NAMESPACE}" "${KFP_SERVICE_ACCOUNT}" \
    --patch "${annotation_patch}"
}

# Delete the OIDC role that belongs to the cluster being torn down.
# The managed policies are detached and the inline policies deleted before
# the role itself is deleted.
# Globals read: OIDC_ROLE_NAME
function delete_oidc_role() {
  local managed_policy inline_policy

  for managed_policy in AmazonSageMakerFullAccess AWSRoboMaker_FullAccess; do
    aws iam detach-role-policy --role-name "${OIDC_ROLE_NAME}" \
      --policy-arn "arn:aws:iam::aws:policy/${managed_policy}"
  done

  for inline_policy in AllowAssumeRole AllowPassRole; do
    aws iam delete-role-policy --role-name "${OIDC_ROLE_NAME}" \
      --policy-name "${inline_policy}"
  done

  aws iam delete-role --role-name "${OIDC_ROLE_NAME}"
}

# Ensure there is an IAM role for the tests to assume and publish its ARN.
# When ASSUMED_ROLE_NAME is empty, a role named "<DEPLOY_NAME>-assumed-role" is
# created with a trust policy that lets principals of this AWS account (the
# account root) call sts:AssumeRole, and the SageMaker and RoboMaker
# full-access managed policies are attached. CREATED_ASSUMED_ROLE is set before
# the create call so that cleanup removes the role even if a later step fails.
# The trust policy is passed inline, so no file is left in the working directory.
# Globals read:    ASSUMED_ROLE_NAME, DEPLOY_NAME, AWS_ACCOUNT_ID
# Globals written: ASSUMED_ROLE_NAME, CREATED_ASSUMED_ROLE, ASSUMED_ROLE_ARN
function generate_assumed_role() {
  # If not defined in the env file
  if [[ -z "${ASSUMED_ROLE_NAME}" ]]; then
    ASSUMED_ROLE_NAME="${DEPLOY_NAME}-assumed-role"
    CREATED_ASSUMED_ROLE="true"

    # The account id is passed as a printf argument, never as part of the format
    local trust_policy
    trust_policy=$(printf '{
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "AWS": "arn:aws:iam::%s:root"
          },
          "Action": "sts:AssumeRole",
          "Condition": {}
        }
      ]
    }' "${AWS_ACCOUNT_ID}")

    aws iam create-role --role-name "${ASSUMED_ROLE_NAME}" \
      --assume-role-policy-document "${trust_policy}" \
      --output=text --query "Role.Arn"
    aws iam attach-role-policy --role-name "${ASSUMED_ROLE_NAME}" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
    aws iam attach-role-policy --role-name "${ASSUMED_ROLE_NAME}" --policy-arn arn:aws:iam::aws:policy/AWSRoboMaker_FullAccess
  fi

  # Generate the ARN using the role name
  ASSUMED_ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${ASSUMED_ROLE_NAME}"
}

# Delete the assumed role, but only when this script created it
# (CREATED_ASSUMED_ROLE is "true"); a role supplied by the caller is left alone.
# Globals read: ASSUMED_ROLE_NAME, CREATED_ASSUMED_ROLE
function delete_assumed_role() {
  # Nothing to do for an empty name or a role we did not create
  if [[ -z "${ASSUMED_ROLE_NAME}" || "${CREATED_ASSUMED_ROLE:-false}" != "true" ]]; then
    return 0
  fi

  # Managed policies have to be detached before the role can be deleted
  local managed_policy
  for managed_policy in AmazonSageMakerFullAccess AWSRoboMaker_FullAccess; do
    aws iam detach-role-policy --role-name "${ASSUMED_ROLE_NAME}" \
      --policy-arn "arn:aws:iam::aws:policy/${managed_policy}"
  done
  aws iam delete-role --role-name "${ASSUMED_ROLE_NAME}"
}

# Stop the Minio port-forward and, when TEST_EXIT_STATUS is greater than zero,
# delete the KFP manifests (the dev environment first, then the cluster-scoped
# resources).
# Globals read: MINIO_PID (optional), TEST_EXIT_STATUS, KFP_VERSION
function cleanup_kfp() {
  # Clean up Minio
  if [[ -n "${MINIO_PID:-}" ]]; then
    # If this fails, deleting the nodegroup later will clean it up anyway
    kill -9 "$MINIO_PID" || true
  fi

  [[ $TEST_EXIT_STATUS -gt 0 ]] || return 0

  local kfp_manifests="github.com/kubeflow/pipelines/manifests/kustomize/env/cert-manager"
  local kustomize_env
  for kustomize_env in dev cluster-scoped-resources; do
    kubectl delete -k "${kfp_manifests}/${kustomize_env}?ref=$KFP_VERSION&timeout=90s"
  done
}

### Main flow: provision infrastructure, install KFP and ACK, run the tests.

# PID of the background cluster launch; stays empty when an existing cluster is used.
eks_launch_pid=""
if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then
  # Launch the cluster in the background so the FSx provisioning below can
  # overlap with it and reduce start-up time
  EKS_CLUSTER_NAME="${DEPLOY_NAME}-eks-cluster"
  launch_eks &
  eks_launch_pid=$!
else
  aws eks update-kubeconfig --name "${EKS_EXISTING_CLUSTER}" --region "$REGION"
  EKS_CLUSTER_NAME="${EKS_EXISTING_CLUSTER}"
  DEPLOY_NAME="${EKS_EXISTING_CLUSTER}"
fi

if [[ "${SKIP_FSX_TESTS}" == "false" ]]; then
  # NOTE(review): these helpers are not defined in this file; presumably they
  # come from the sourced fsx_setup and set FSX_SECURITY_GROUP_ID / FSX_ID.
  create_fsx_security_group
  create_fsx_instance
fi

if [[ -n "${eks_launch_pid}" ]]; then
  # A bare `wait` always returns 0 and would hide a failed cluster launch.
  # Waiting on the PID propagates its status, so errexit aborts the run here.
  wait "${eks_launch_pid}"
fi
# Also wait for any other outstanding background jobs
wait

if [[ "${SKIP_KFP_OIDC_SETUP}" == "false" ]]; then
  generate_oidc_role_name
  "$cwd"/generate_iam_role "${EKS_CLUSTER_NAME}" "${OIDC_ROLE_NAME}" "${REGION}" "${KFP_NAMESPACE}" "${KFP_SERVICE_ACCOUNT}"
fi

install_kfp
if [[ "${SKIP_KFP_OIDC_SETUP}" == "false" ]]; then
  install_oidc_role
fi
generate_assumed_role
install_ack

pytest_args=(
  --region "${REGION}"
  --sagemaker-role-arn "${SAGEMAKER_EXECUTION_ROLE_ARN}"
  --s3-data-bucket "${S3_DATA_BUCKET}"
  --kfp-namespace "${KFP_NAMESPACE}"
  --minio-service-port "${MINIO_LOCAL_PORT}"
  --assume-role-arn "${ASSUMED_ROLE_ARN}"
  --robomaker-role-arn "${ROBOMAKER_EXECUTION_ROLE_ARN}"
)

# pytest honours only the last -m it is given, so the FSx exclusion and the
# user-supplied marker are combined into a single expression.
marker_expression=""
if [[ "${SKIP_FSX_TESTS}" == "true" ]]; then
  marker_expression="not fsx_test"
else
  # Get the VPC arguments for the FSx test
  IFS=',' read -r -a private_subnets <<< "$EKS_PRIVATE_SUBNETS"
  pytest_args+=( --fsx-subnet "${private_subnets[0]}" --fsx-security-group "${FSX_SECURITY_GROUP_ID}" --fsx-id "${FSX_ID}" )
fi

if [[ -n "${PYTEST_MARKER}" ]]; then
  if [[ -n "${marker_expression}" ]]; then
    marker_expression="(${PYTEST_MARKER}) and ${marker_expression}"
  else
    marker_expression="${PYTEST_MARKER}"
  fi
fi

if [[ -n "${marker_expression}" ]]; then
  pytest_args+=( -m "${marker_expression}" )
fi

DIR_THIS_FILE="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

cd "${DIR_THIS_FILE}/../"
# Capture the real pytest status instead of letting errexit end the script
# before it is recorded; cleanup_kfp reads TEST_EXIT_STATUS in the EXIT trap.
TEST_EXIT_STATUS=0
python -m pytest "${pytest_args[@]}" --junitxml ./integration_tests.xml -n 9 || TEST_EXIT_STATUS=$?
if (( TEST_EXIT_STATUS != 0 )); then
  exit "${TEST_EXIT_STATUS}"
fi